fuente: https://github.com/DiegoKoz/discursos_presidenciales

library(glue)

Attaching package: ‘glue’

The following object is masked from ‘package:dplyr’:

    collapse
df <- read_rds('data/discursos_presidenciales.rds')

df <- df %>% 
  mutate(texto = tolower(texto),
         texto = stri_trans_general(texto, "Latin-ASCII"),
         texto = str_trim(texto,side = 'both'),
         texto = str_replace_all(texto,'\t',' '),
         texto = str_replace_all(texto,'\n',' '),
         texto = str_replace_all(texto,'\r',' '),
         texto = str_replace_all(texto,'[[:punct:]]',' '),
         texto = str_replace_all(texto,'\\d','NUM'),
         texto = str_replace_all(texto,'(NUM)+','NUM'),
         texto = str_replace_all(texto,"\\s+", " "))

palabras_comunes <- read_csv(file = 'data/r_words.txt',col_names = F)
Parsed with column specification:
cols(
  X1 = col_character()
)
palabras_comunes <-stri_trans_general(palabras_comunes$X1, "Latin-ASCII")
palabras_comunes <- unique(palabras_comunes)

texto <- df$texto

Corpus = VCorpus(VectorSource(texto))
Corpus = tm_map(Corpus, removeWords, c(stopwords(kind = "es"),palabras_comunes))
# Corpus <- tm_map(Corpus, stemDocument, language = "spanish") # Corpus  

dtm <- DocumentTermMatrix(Corpus)
tm::nTerms(dtm)
[1] 19409
#elimino los docuemntos vacios
rowTotals <- apply(dtm , 1, sum)
nDocs(dtm)
[1] 603
dtm   <- dtm[rowTotals> 0, ]
nDocs(dtm)
[1] 602
lda_fit
A LDA_Gibbs topic model with 10 topics.
Terms <- terms(lda_fit, 10)
Terms
      Topic 1         Topic 2          Topic 3      Topic 4           Topic 5        Topic 6        Topic 7       Topic 8      Topic 9     
 [1,] "labor"         "organizaciones" "presidente" "pami"            "mechita"      "club"         "periodista"  "mar"        "elegir"    
 [2,] "pone"          "medico"         "paises"     "tecnopolis"      "julio"        "competir"     "justicia"    "admiracion" "alumna"    
 [3,] "fragata"       "afectando"      "mundo"      "declaracion"     "capaces"      "saladillo"    "informacion" "alguno"     "libremente"
 [4,] "satelite"      "complejas"      "macri"      "anteriores"      "nacion"       "clubes"       "congreso"    "vecina"     "recibirnos"
 [5,] "ayuden"        "habian"         "pais"       "deporte"         "enfrentamos"  "anteriores"   "provincias"  "colectivo"  "alumno"    
 [6,] "encabezar"     "ratificando"    "num"        "enormemente"     "enormes"      "construccion" "prensa"      "ensena"     "podes"     
 [7,] "entendimiento" "tiempos"        "ser"        "maravilloso"     "expresa"      "demostrarles" "publicos"    "fronteras"  "deporte"   
 [8,] "funcionar"     "trabajaba"      "desarrollo" "profesionalismo" "sentirnos"    "jubilados"    "respecto"    "colegio"    "dio"       
 [9,] "jesus"         "trabajemos"     "primer"     "vuelvo"          "acompanarnos" "llenos"       "reforma"     "comunidad"  "liderar"   
[10,] "juicio"        "cabe"           "anos"       "conoci"          "continente"   "resignar"     "fiscal"      "guerra"     "gana"      
      Topic 10    
 [1,] "num"       
 [2,] "argentinos"
 [3,] "pais"      
 [4,] "trabajo"   
 [5,] "anos"      
 [6,] "aca"       
 [7,] "mundo"     
 [8,] "ser"       
 [9,] "verdad"    
[10,] "juntos"    

Visualizacion

topicmodels_json_ldavis <- function(fitted, dtm){
    svd_tsne <- function(x) tsne(svd(x)$u)

    # Find required quantities
    phi <- as.matrix(posterior(fitted)$terms)
    theta <- as.matrix(posterior(fitted)$topics)
    vocab <- colnames(phi)
    term_freq <- slam::col_sums(dtm)

    # Convert to json
    json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                            vocab = vocab,
                            mds.method = svd_tsne,
                            plot.opts = list(xlab="", ylab=""),
                            doc.length = as.vector(table(dtm$i)),
                            term.frequency = term_freq)

    return(json_lda)
}
json_res <- topicmodels_json_ldavis(lda_fit, dtm)
sigma summary: Min. : 33554432 |1st Qu. : 33554432 |Median : 33554432 |Mean : 33554432 |3rd Qu. : 33554432 |Max. : 33554432 |
Epoch: Iteration #100 error is: 14.1744077047285
Epoch: Iteration #200 error is: 0.6642175441671
Epoch: Iteration #300 error is: 0.375647353159243
Epoch: Iteration #400 error is: 0.276097083111349
Epoch: Iteration #500 error is: 0.2622322268503
Epoch: Iteration #600 error is: 0.253897306321437
Epoch: Iteration #700 error is: 0.252719375064657
Epoch: Iteration #800 error is: 0.252652072110335
Epoch: Iteration #900 error is: 0.252626857162804
Epoch: Iteration #1000 error is: 0.252588385221556
serVis(json_res)
createTcpServer: address already in use
To stop the server, run servr::daemon_stop(2) or restart your R session
Serving the directory /tmp/RtmpVR2iwu/file15a266d88ad3 at http://127.0.0.1:3222

LS0tCnRpdGxlOiAiRGlzY3Vyc29zIFByZXNpZGVuY2lhbGVzIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgpmdWVudGU6IGh0dHBzOi8vZ2l0aHViLmNvbS9EaWVnb0tvei9kaXNjdXJzb3NfcHJlc2lkZW5jaWFsZXMgCgpgYGB7cn0KbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkoZ2x1ZSkKbGlicmFyeSh0bSkKbGlicmFyeSh0b3BpY21vZGVscykKbGlicmFyeSh0aWR5dGV4dCkKbGlicmFyeShzdHJpbmdpKQpsaWJyYXJ5KExEQXZpcykKbGlicmFyeShzbGFtKQpsaWJyYXJ5KHRzbmUpCmxpYnJhcnkobHVicmlkYXRlKQpgYGAKCmBgYHtyfQpkZiA8LSByZWFkX3JkcygnZGF0YS9kaXNjdXJzb3NfcHJlc2lkZW5jaWFsZXMucmRzJykKCmRmIDwtIGRmICU+JSAKICBtdXRhdGUodGV4dG8gPSB0b2xvd2VyKHRleHRvKSwKICAgICAgICAgdGV4dG8gPSBzdHJpX3RyYW5zX2dlbmVyYWwodGV4dG8sICJMYXRpbi1BU0NJSSIpLAogICAgICAgICB0ZXh0byA9IHN0cl90cmltKHRleHRvLHNpZGUgPSAnYm90aCcpLAogICAgICAgICB0ZXh0byA9IHN0cl9yZXBsYWNlX2FsbCh0ZXh0bywnXHQnLCcgJyksCiAgICAgICAgIHRleHRvID0gc3RyX3JlcGxhY2VfYWxsKHRleHRvLCdcbicsJyAnKSwKICAgICAgICAgdGV4dG8gPSBzdHJfcmVwbGFjZV9hbGwodGV4dG8sJ1xyJywnICcpLAogICAgICAgICB0ZXh0byA9IHN0cl9yZXBsYWNlX2FsbCh0ZXh0bywnW1s6cHVuY3Q6XV0nLCcgJyksCiAgICAgICAgIHRleHRvID0gc3RyX3JlcGxhY2VfYWxsKHRleHRvLCdcXGQnLCdOVU0nKSwKICAgICAgICAgdGV4dG8gPSBzdHJfcmVwbGFjZV9hbGwodGV4dG8sJyhOVU0pKycsJ05VTScpLAogICAgICAgICB0ZXh0byA9IHN0cl9yZXBsYWNlX2FsbCh0ZXh0bywiXFxzKyIsICIgIikpCgoKYGBgCgoKYGBge3J9CgpwYWxhYnJhc19jb211bmVzIDwtIHJlYWRfY3N2KGZpbGUgPSAnZGF0YS9yX3dvcmRzLnR4dCcsY29sX25hbWVzID0gRikKCnBhbGFicmFzX2NvbXVuZXMgPC1zdHJpX3RyYW5zX2dlbmVyYWwocGFsYWJyYXNfY29tdW5lcyRYMSwgIkxhdGluLUFTQ0lJIikKcGFsYWJyYXNfY29tdW5lcyA8LSB1bmlxdWUocGFsYWJyYXNfY29tdW5lcykKCnRleHRvIDwtIGRmJHRleHRvCgpDb3JwdXMgPSBWQ29ycHVzKFZlY3RvclNvdXJjZSh0ZXh0bykpCkNvcnB1cyA9IHRtX21hcChDb3JwdXMsIHJlbW92ZVdvcmRzLCBjKHN0b3B3b3JkcyhraW5kID0gImVzIikscGFsYWJyYXNfY29tdW5lcykpCiMgQ29ycHVzIDwtIHRtX21hcChDb3JwdXMsIHN0ZW1Eb2N1bWVudCwgbGFuZ3VhZ2UgPSAic3BhbmlzaCIpICMgQ29ycHVzICAKCmR0bSA8LSBEb2N1bWVudFRlcm1NYXRyaXgoQ29ycHVzKQp0bTo6blRlcm1zKGR0bSkKI2VsaW1pbm8gbG9zIGRvY3VlbW50b3MgdmFjaW9zCnJvd1RvdGFscyA8LSBhcHBseShkdG0gLCAxLCBzdW0pCm5Eb2NzKGR0bSkKZHRtICAgPC0gZHRtW3Jvd1RvdGFscz4gMCwgXQpuRG9jcyhkdG0pCmBgYAoKYGBge3J9CgpsZGFfZml0IDwtIExEQShkdG0sIGsgPSAxMCxtZXRob2QgPSAiR2liYnMiLCBjb250cm9sID0gbGlzdChkZWx0YT0wLjYsc2VlZCA9IDEyMzQpKQpsZGFfZml0CmBgYAoKYGBge3J9ClRlcm1zIDwtIHRlcm1zKGxkYV9maXQsIDEwKQpUZXJtcwpgYGAKCgoKVmlzdWFsaXphY2lvbgoKCgoKYGBge3J9CnRvcGljbW9kZWxzX2pzb25fbGRhdmlzIDwtIGZ1bmN0aW9uKGZpdHRlZCwgZHRtKXsKICAgIHN2ZF90c25lIDwtIGZ1bmN0aW9uKHgpIHRzbmUoc3ZkKHgpJHUpCgogICAgIyBGaW5kIHJlcXVpcmVkIHF1YW50aXRpZXMKICAgIHBoaSA8LSBhcy5tYXRyaXgocG9zdGVyaW9yKGZpdHRlZCkkdGVybXMpCiAgICB0aGV0YSA8LSBhcy5tYXRyaXgocG9zdGVyaW9yKGZpdHRlZCkkdG9waWNzKQogICAgdm9jYWIgPC0gY29sbmFtZXMocGhpKQogICAgdGVybV9mcmVxIDwtIHNsYW06OmNvbF9zdW1zKGR0bSkKCiAgICAjIENvbnZlcnQgdG8ganNvbgogICAganNvbl9sZGEgPC0gTERBdmlzOjpjcmVhdGVKU09OKHBoaSA9IHBoaSwgdGhldGEgPSB0aGV0YSwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIHZvY2FiID0gdm9jYWIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBtZHMubWV0aG9kID0gc3ZkX3RzbmUsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICBwbG90Lm9wdHMgPSBsaXN0KHhsYWI9InRzbmUiLCB5bGFiPSIiKSwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIGRvYy5sZW5ndGggPSBhcy52ZWN0b3IodGFibGUoZHRtJGkpKSwKICAgICAgICAgICAgICAgICAgICAgICAgICAgIHRlcm0uZnJlcXVlbmN5ID0gdGVybV9mcmVxKQoKICAgIHJldHVybihqc29uX2xkYSkKfQpgYGAKCmBgYHtyfQpqc29uX3JlcyA8LSB0b3BpY21vZGVsc19qc29uX2xkYXZpcyhsZGFfZml0LCBkdG0pCgpzZXJWaXMoanNvbl9yZXMpCmBgYAoKCgoKYGBge3J9CnRoZXRhIDwtIGFzLm1hdHJpeChwb3N0ZXJpb3IobGRhX2ZpdCkkdG9waWNzKQoKdGhldGEgPC0gYXNfdGliYmxlKHRoZXRhKQoKZGlzdF90b3BpY29zIDwtIGRmW3doaWNoKHJvd1RvdGFscz4wKSxdICU+JSAgI3RlbmdvIHF1ZSBlbGltaW5hciBlc2UgZG9jdWVtbnRvIHF1ZSBlc3RhYmEgdmFjaW8KICBzZWxlY3QoZmVjaGEsdGl0dWxvKSAlPiUgCiAgYmluZF9jb2xzKHRoZXRhKQoKYGBgCgpgYGB7cn0KCmRpc3RfdG9waWNvcyA8LSBkaXN0X3RvcGljb3MgJT4lIAogIHNlbGVjdCgtdGl0dWxvKSAlPiUgCiAgbXV0YXRlKGZlY2hhID0gZmxvb3JfZGF0ZShmZWNoYSwnbW9udGgnKSkgJT4lIAogIGdyb3VwX2J5KGZlY2hhKSAlPiUgCiAgc3VtbWFyaXNlX2FsbChtZWFuKQpgYGAKCmBgYHtyfQpkaXN0X3RvcGljb3MgJT4lIAogIGdhdGhlcih0b3BpY28scHJvcG9yY2lvbl9wcm9tZWRpbywgMjoxMSkgJT4lIAogIGdncGxvdCguLCBhZXMoZmVjaGEscHJvcG9yY2lvbl9wcm9tZWRpbywgY29sb3I9dG9waWNvKSkgKwogIGdlb21fbGluZSgpCmBgYAoK